Author

Maria Knapczyk

1 Intro

Show and hide code
library(tidymodels)
library(tidyverse)
library(stringr)
library(vip)
library(pdp)
library(DALEX)
library(DALEXtra)
library(bestNormalize)
library(rules)
library(baguette)
library(finetune)
library(doParallel)
library(DT)
library(ggplot2)
library(lubridate)
library(future)

tidymodels_prefer()
Show and hide code
data = read.csv("coffee_shop_sales.csv")
datatable(data)

2 Data

Show and hide code
str(data)
'data.frame':   4998 obs. of  15 variables:
 $ transaction_id       : int  1 2 3 4 5 6 7 8 9 10 ...
 $ transaction_date     : chr  "2025-03-13" "2024-09-26" "2025-03-23" "2025-05-28" ...
 $ transaction_time     : chr  "10:10:19" "13:13:41" "16:17:20" "16:31:54" ...
 $ product_id           : chr  "P302" "P302" "P302" "P201" ...
 $ product_name         : chr  "Croissant" "Croissant" "Croissant" "Green Tea" ...
 $ product_category     : chr  "Bakery" "Bakery" "Bakery" "Tea" ...
 $ size                 : chr  NA NA NA "Small" ...
 $ quantity             : int  1 1 2 2 3 1 1 3 3 3 ...
 $ unit_price           : num  3 3 3 2 3 2.5 2 3.5 2.5 3 ...
 $ total_price          : num  3 3 6 4 9 2.5 2 10.5 7.5 9 ...
 $ payment_method       : chr  "Debit Card" "Credit Card" "Cash" "Credit Card" ...
 $ customer_id          : chr  "C624" "C549" "C857" "C402" ...
 $ loyalty_points_earned: int  6 6 0 8 0 3 7 2 7 9 ...
 $ location             : chr  "Airport" "University" "Airport" "Downtown" ...
 $ barista_id           : chr  "EMP001" "EMP001" "EMP002" "EMP004" ...
Show and hide code
data <- data |> 
  as_tibble() |> 
  janitor::clean_names() |> 

  mutate(
    transaction_date = as.Date(transaction_date),
    transaction_datetime = as.POSIXct(
      paste(transaction_date, transaction_time),
      format="%Y-%m-%d %H:%M:%S"
    )
  ) |> 

  mutate(
    hour = hour(transaction_datetime)
  ) |> 

  mutate(
    wday = wday(transaction_date, label = TRUE, abbr = TRUE),
    day_work = if_else(wday %in% c("Sat", "Sun"), "weekend", "week"),
    day_work = factor(day_work)
  )
Show and hide code
data |> summary()
 transaction_id transaction_date     transaction_time    product_id       
 Min.   :   1   Min.   :2024-08-08   Length:4998        Length:4998       
 1st Qu.:1250   1st Qu.:2024-11-07   Class :character   Class :character  
 Median :2500   Median :2025-02-08   Mode  :character   Mode  :character  
 Mean   :2500   Mean   :2025-02-07                                        
 3rd Qu.:3749   3rd Qu.:2025-05-08                                        
 Max.   :4998   Max.   :2025-08-07                                        
                                                                          
 product_name       product_category       size              quantity    
 Length:4998        Length:4998        Length:4998        Min.   :1.000  
 Class :character   Class :character   Class :character   1st Qu.:1.000  
 Mode  :character   Mode  :character   Mode  :character   Median :2.000  
                                                          Mean   :1.986  
                                                          3rd Qu.:3.000  
                                                          Max.   :3.000  
                                                                         
   unit_price     total_price    payment_method     customer_id       
 Min.   :2.000   Min.   : 2.00   Length:4998        Length:4998       
 1st Qu.:2.500   1st Qu.: 3.50   Class :character   Class :character  
 Median :3.000   Median : 6.00   Mode  :character   Mode  :character  
 Mean   :3.096   Mean   : 6.15                                        
 3rd Qu.:3.500   3rd Qu.: 8.00                                        
 Max.   :4.500   Max.   :13.50                                        
                                                                      
 loyalty_points_earned   location          barista_id       
 Min.   : 0.000        Length:4998        Length:4998       
 1st Qu.: 0.000        Class :character   Class :character  
 Median : 1.000        Mode  :character   Mode  :character  
 Mean   : 3.022                                             
 3rd Qu.: 6.000                                             
 Max.   :10.000                                             
                                                            
 transaction_datetime                  hour        wday        day_work   
 Min.   :2024-08-08 12:15:25.00   Min.   : 8.00   Sun:713   week   :3539  
 1st Qu.:2024-11-07 11:53:53.00   1st Qu.:11.00   Mon:682   weekend:1459  
 Median :2025-02-08 10:53:26.00   Median :14.00   Tue:716                 
 Mean   :2025-02-07 15:31:04.34   Mean   :13.93   Wed:687                 
 3rd Qu.:2025-05-08 20:26:52.75   3rd Qu.:17.00   Thu:770                 
 Max.   :2025-08-07 19:48:16.00   Max.   :20.00   Fri:684                 
                                                  Sat:746                 
Show and hide code
colSums(is.na(data))
       transaction_id      transaction_date      transaction_time 
                    0                     0                     0 
           product_id          product_name      product_category 
                    0                     0                     0 
                 size              quantity            unit_price 
                 1464                     0                     0 
          total_price        payment_method           customer_id 
                    0                     0                  1537 
loyalty_points_earned              location            barista_id 
                    0                     0                     0 
 transaction_datetime                  hour                  wday 
                    0                     0                     0 
             day_work 
                    0 
Show and hide code
cor(data[sapply(data, is.numeric)])
                      transaction_id     quantity   unit_price   total_price
transaction_id           1.000000000 -0.003025046  0.013439873 -0.0021041791
quantity                -0.003025046  1.000000000  0.000410798  0.8658484256
unit_price               0.013439873  0.000410798  1.000000000  0.4625576353
total_price             -0.002104179  0.865848426  0.462557635  1.0000000000
loyalty_points_earned   -0.003518060  0.013822510 -0.011421552  0.0003937413
hour                     0.007342483  0.014040713  0.017121467  0.0188011150
                      loyalty_points_earned         hour
transaction_id                -0.0035180599  0.007342483
quantity                       0.0138225098  0.014040713
unit_price                    -0.0114215517  0.017121467
total_price                    0.0003937413  0.018801115
loyalty_points_earned          1.0000000000 -0.008620883
hour                          -0.0086208833  1.000000000

3 Plots

Show and hide code
ggplot(data, aes(x=total_price)) +
  labs(
    title = "Histogram cen",
    x = "Cena",
    y = "Ilość"
  ) + 
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black")

Show and hide code
ggplot(data, aes(x=loyalty_points_earned)) + 
  labs(
    title = "Histogram zdobytych punktów lojanościowych",
    x = "Przedziały",
    y = "Ilość"
  ) + 
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black")

Show and hide code
ggplot(data, aes(x = product_category)) +
  geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji w poszczególnych kategoriach produktów",
    x = "Kategoria produktu",
    y = "Liczba transakcji"
  ) 

Show and hide code
ggplot(data, aes(x=payment_method)) + geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji poszczególnych metod płatniczych",
    x = "Metoda płatności",
    y = "Liczba transakcji"
  )

Show and hide code
ggplot(data, aes(x=location)) + geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji w poszczególnych lokalizacjach",
    x = "Lokalizacja",
    y = "Liczba transakcji"
  )

Show and hide code
hourly_products <- data %>%
  group_by(hour, product_name) %>%
  summarise(transactions = n(), .groups = "drop") %>%
  arrange(hour, desc(transactions)) %>%
  group_by(hour) %>%
  slice_max(transactions, n = 3)


ggplot(hourly_products, aes(x = factor(hour), y = transactions, fill = product_name)) +
  geom_col() +
  labs(
    title = "Najczęściej kupowany produkt w każdej godzinie",
    x = "Godzina",
    y = "Liczba transakcji",
    fill = "Produkt"
  )

4 Formula

Show and hide code
forms <- formula(total_price ~ .)

5 Data split

Show and hide code
set.seed(111)

trans_split <- initial_validation_split(data = data, strata = total_price)

trans_train <- training(trans_split)
trans_test  <- testing(trans_split)
trans_valid <- validation_set(trans_split)

trans_folds <- vfold_cv(trans_train, strata = total_price, v = 10, repeats = 5) # testowo

save(trans_split, trans_train, trans_test, trans_valid, trans_folds,file = "coffesplitted_data.Rdata")
Show and hide code
load("coffesplitted_data.Rdata")

6 Model

Show and hide code
# CART
cart_spec <- decision_tree(
  cost_complexity = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("regression")

# Random Forest
rf_spec <- rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = tune()
) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Cubist
cubist_spec <- cubist_rules(
  committees = tune(),
  neighbors = tune()
) %>%
  set_engine("Cubist") %>%
  set_mode("regression")

# XGBoost
xgb_spec <- boost_tree(
  tree_depth     = tune(),
  learn_rate     = tune(),
  loss_reduction = tune(),
  min_n          = tune(),
  sample_size    = tune(),
  trees          = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# MARS
mars_spec <- mars(
  num_terms = tune(),
  prod_degree = tune()
) %>%
  set_engine("earth") %>%
  set_mode("regression")

# knn
knn_spec <- nearest_neighbor(
  neighbors = tune()
) %>%
  set_engine("kknn") %>%
  set_mode("regression")

7 Recipe

Show and hide code
basic_rec <- recipe(total_price ~ ., data = trans_train) %>%
  step_mutate(hour = as.numeric(hour)) %>%
  update_role(
    transaction_id, transaction_date, transaction_time,
    product_id, customer_id, barista_id,
    new_role = "ID"
  ) %>%
  step_rm(quantity, unit_price, loyalty_points_earned)

basic_t_rec <- recipe(total_price ~ ., data = trans_train) %>%
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") %>%
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) %>%
  step_rm(quantity, unit_price, loyalty_points_earned) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_nominal_predictors())

cubist_rec <- recipe(total_price ~ ., data = trans_train) %>%
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") %>%
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) %>%
  step_rm(quantity, unit_price, loyalty_points_earned) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_string2factor(all_nominal_predictors()) %>%
  step_zv(all_predictors())

prep(basic_t_rec, training = trans_train) %>% juice()
# A tibble: 2,997 × 38
   transaction_id transaction_date transaction_time product_id customer_id
            <int> <date>           <chr>            <chr>      <chr>      
 1              1 2025-03-13       10:10:19         P302       C624       
 2              5 2025-01-16       19:17:24         P302       <NA>       
 3              7 2024-09-16       20:51:57         P201       <NA>       
 4              8 2024-11-19       14:17:27         P103       C944       
 5             10 2025-04-09       10:34:23         P302       <NA>       
 6             11 2024-12-21       20:42:07         P102       C656       
 7             14 2024-10-17       19:22:30         P202       C652       
 8             15 2025-02-11       18:53:16         P102       <NA>       
 9             16 2025-05-08       12:16:37         P102       C547       
10             18 2024-09-11       15:27:29         P202       C720       
# ℹ 2,987 more rows
# ℹ 33 more variables: barista_id <chr>, transaction_datetime <dbl>,
#   hour <dbl>, total_price <dbl>, product_name_Cappuccino <dbl>,
#   product_name_Chai.Latte <dbl>, product_name_Croissant <dbl>,
#   product_name_Espresso <dbl>, product_name_Green.Tea <dbl>,
#   product_name_Latte <dbl>, product_name_unknown <dbl>,
#   product_category_Coffee <dbl>, product_category_Tea <dbl>, …
Show and hide code
prep(basic_rec, training = trans_train) |> juice()
# A tibble: 2,997 × 16
   transaction_id transaction_date transaction_time product_id product_name
            <int> <date>           <chr>            <chr>      <fct>       
 1              1 2025-03-13       10:10:19         P302       Croissant   
 2              5 2025-01-16       19:17:24         P302       Croissant   
 3              7 2024-09-16       20:51:57         P201       Green Tea   
 4              8 2024-11-19       14:17:27         P103       Latte       
 5             10 2025-04-09       10:34:23         P302       Croissant   
 6             11 2024-12-21       20:42:07         P102       Cappuccino  
 7             14 2024-10-17       19:22:30         P202       Chai Latte  
 8             15 2025-02-11       18:53:16         P102       Cappuccino  
 9             16 2025-05-08       12:16:37         P102       Cappuccino  
10             18 2024-09-11       15:27:29         P202       Chai Latte  
# ℹ 2,987 more rows
# ℹ 11 more variables: product_category <fct>, size <fct>,
#   payment_method <fct>, customer_id <chr>, location <fct>, barista_id <chr>,
#   transaction_datetime <dttm>, hour <dbl>, wday <ord>, day_work <fct>,
#   total_price <dbl>
Show and hide code
summary(basic_rec) |> knitr::kable()
variable type role source
transaction_id integer, numeric ID original
transaction_date date ID original
transaction_time string , unordered, nominal ID original
product_id string , unordered, nominal ID original
product_name string , unordered, nominal predictor original
product_category string , unordered, nominal predictor original
size string , unordered, nominal predictor original
quantity integer, numeric predictor original
unit_price double , numeric predictor original
payment_method string , unordered, nominal predictor original
customer_id string , unordered, nominal ID original
loyalty_points_earned integer, numeric predictor original
location string , unordered, nominal predictor original
barista_id string , unordered, nominal ID original
transaction_datetime datetime predictor original
hour integer, numeric predictor original
wday ordered, nominal predictor original
day_work factor , unordered, nominal predictor original
total_price double , numeric outcome original

8 Workflow

Show and hide code
a <- workflow_set(
  preproc = list(b = basic_rec),
  models  = list(
    rpart = cart_spec,
    ranger = rf_spec
  )
)

b <- workflow_set(
  preproc = list(t = basic_t_rec),
  models  = list(xgboost = xgb_spec
  )
)

c <- workflow_set(
  preproc = list(cubist = cubist_rec),
  models = list(cubist = cubist_spec)
)

d <- workflow_set(
  preproc = list(transformed = basic_rec),
  models = list(
    mars = mars_spec,
    knn  = knn_spec
  )
)

basic <- bind_rows(a, b, c, d)
basic$wflow_id <- str_sub(basic$wflow_id, start = 3, end = 100)

basic
# A workflow set/tibble: 6 × 4
  wflow_id       info             option    result    
  <chr>          <list>           <list>    <list>    
1 rpart          <tibble [1 × 4]> <opts[0]> <list [0]>
2 ranger         <tibble [1 × 4]> <opts[0]> <list [0]>
3 xgboost        <tibble [1 × 4]> <opts[0]> <list [0]>
4 bist_cubist    <tibble [1 × 4]> <opts[0]> <list [0]>
5 ansformed_mars <tibble [1 × 4]> <opts[0]> <list [0]>
6 ansformed_knn  <tibble [1 × 4]> <opts[0]> <list [0]>

9 Parameters

Show and hide code
#n nrow i p predyktory

cart_param <- cart_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)) # <1% z n
  )


rf_param <- rf_spec |>
  extract_parameter_set_dials() |> 
  update(
    min_n = min_n(c(5, 30)),
    mtry =  mtry(c(4, 9)),   # sqrt i polowa p
    trees = trees(c(100, 500))
  )


xgb_param <- xgb_spec |> 
  extract_parameter_set_dials() |> 
  update(
    min_n = min_n(c(5, 30)), 
    trees = trees(c(100, 500)), 
    tree_depth = tree_depth(c(2,10)),
    learn_rate = learn_rate(c(0.01, 0.3)),
    loss_reduction = loss_reduction(c(0, 5)),
    sample_size = sample_prop(c(0.5, 1))
  )


cubist_param <- cubist_spec |> 
  extract_parameter_set_dials() |> 
  update(
    committees = committees(c(1, 10)),
    neighbors  = neighbors(c(0, 5))
  )


mars_param <- mars_spec |>
  extract_parameter_set_dials() |>
  update(
    num_terms = num_terms(c(2, 30)),
    prod_degree = prod_degree(c(1, 2))
  )


knn_param <- knn_spec |>
  extract_parameter_set_dials() |>
  update(
    neighbors = neighbors(c(3, 15))
  )



basic <- basic |> option_add(param_info = cart_param, id = "rpart")
basic <- basic |> option_add(param_info = rf_param, id = "ranger")
basic <- basic |> option_add(param_info = xgb_param, id = "xgboost")
basic <- basic |> option_add(param_info = cubist_param, id = "cubist")
basic <- basic |> option_add(param_info = mars_param, id = "mars")
basic <- basic |> option_add(param_info = knn_param, id = "knn")


basic
# A workflow set/tibble: 6 × 4
  wflow_id       info             option    result    
  <chr>          <list>           <list>    <list>    
1 rpart          <tibble [1 × 4]> <opts[1]> <list [0]>
2 ranger         <tibble [1 × 4]> <opts[1]> <list [0]>
3 xgboost        <tibble [1 × 4]> <opts[1]> <list [0]>
4 bist_cubist    <tibble [1 × 4]> <opts[0]> <list [0]>
5 ansformed_mars <tibble [1 × 4]> <opts[0]> <list [0]>
6 ansformed_knn  <tibble [1 × 4]> <opts[0]> <list [0]>
Show and hide code
basic |>
  split(~wflow_id) |>
  map(
    \(x) extract_parameter_set_dials(
      x = x,
      id = x$wflow_id
    ) |>
      _$object
  )
$ansformed_knn
$ansformed_knn[[1]]


$ansformed_mars
$ansformed_mars[[1]]

$ansformed_mars[[2]]


$bist_cubist
$bist_cubist[[1]]

$bist_cubist[[2]]


$ranger
$ranger[[1]]

$ranger[[2]]

$ranger[[3]]


$rpart
$rpart[[1]]

$rpart[[2]]


$xgboost
$xgboost[[1]]

$xgboost[[2]]

$xgboost[[3]]

$xgboost[[4]]

$xgboost[[5]]

$xgboost[[6]]

10 Grid and tune

Show and hide code
race_ctrl <- control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = FALSE,
  verbose = TRUE
)
Show and hide code
cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)

race_models <- basic |> 
  filter(wflow_id %in% c("rpart", "ranger", "bist_cubist", "xgboost"))

time_race <- Sys.time()
race_result <- workflow_map(
  race_models,
  "tune_race_anova",
  seed = 111,
  resamples = trans_folds,
  grid = 50,
  control = race_ctrl,
  verbose = TRUE,
  metrics = metric_set(rmse, mae, rsq)
)

Sys.time() - time_race
Time difference of 6.352333 mins
Show and hide code
grid_ctrl <- control_grid(
  save_pred = TRUE,
  parallel_over = "everything"
)

# workflow_set już masz
grid_models <- workflow_set(
  preproc = list(transformed = basic_t_rec),
  models = list(
    mars = mars_spec,
    knn  = knn_spec
  )
)

# strojenie wszystkich workflowów naraz
grid_result <- workflow_map(
  grid_models,
  "tune_grid",
  resamples = trans_folds,
  grid = 20,
  metrics = metric_set(rmse, mae, rsq),
  control = grid_ctrl
)


stopCluster(cl)

save(race_result, grid_result, file = "tune_results_split.Rdata")

11 Best model selection

Show and hide code
load("tune_results_split.Rdata")
combined_results <- bind_rows(race_result, grid_result)

best_results <-
  combined_results |>
  split(~wflow_id) |>
  map(
    \(x)
      extract_workflow_set_result(x = x, id = x$wflow_id) |>
      select_best(metric = "rmse", )
  )

best_models <-
  combined_results |>
  split(~wflow_id) |>
  map(
    \(x) 
      extract_workflow(x = x, id = x$wflow_id) |>
      finalize_workflow(best_results[[x$wflow_id]]) |>
      last_fit(
        split = trans_split, 
        metrics = metric_set(rmse, rsq, mae)
        )
  )

save(best_models, file = "best_models.rdata")
Show and hide code
load("tune_result.Rdata")
load("best_models.rdata")

12 Result tune

Show and hide code
combined_results |>
  split(~wflow_id) |>
  map(
    \(x) 
      extract_workflow_set_result(x = x, id = x$wflow_id) |> 
      show_best(metric = "rmse", n = 1) |> 
      select(-n, -.metric, -.config)
  ) |>
  knitr::kable()
committees neighbors .estimator mean std_err
11 0 standard 2.600359 0.0076663
mtry trees min_n .estimator mean std_err
4 157 25 standard 2.654244 0.0081989
cost_complexity min_n .estimator mean std_err
0.0033932 17 standard 2.631114 0.0077048
neighbors .estimator mean std_err
15 standard 2.778166 0.0095855
num_terms prod_degree .estimator mean std_err
5 2 standard 2.649441 0.0077632
trees min_n tree_depth learn_rate loss_reduction sample_size .estimator mean std_err
344 23 4 1.037333 138.9495 0.5510204 standard 2.641582 0.0078835
Show and hide code
combined_results |> 
  rank_results(select_best = T) |> 
  unite("rate", c("mean", "std_err"), sep = "/") |> 
  pivot_wider(names_from = .metric, values_from = rate) |>
  separate_wider_delim(
    cols = mae:rsq, 
    delim = "/", 
    names = c("", "_std_err"), 
    names_sep = "") |> 
  select(-preprocessor, -n, -model) |>
  mutate(.config = str_sub(.config, 20, 30)) |> 
  mutate(across(
    .cols = mae:rsq_std_err, 
    .fns = \(x) signif(x = as.numeric(x), digits = 3)
  )) |> arrange(mae) |> 
  gt::gt() |> 
  gt::tab_header(title = "Wyniki oceny dla zestawu walidacyjnego")
Wyniki oceny dla zestawu walidacyjnego
wflow_id .config rank mae mae_std_err rmse rmse_std_err rsq rsq_std_err
bist_cubist 06 1 2.10 0.00967 2.60 0.00767 0.215 0.00518
rpart 42 2 2.15 0.00894 2.63 0.00770 0.198 0.00531
xgboost 37 3 2.19 0.00887 2.64 0.00788 0.193 0.00505
ranger 07 5 2.21 0.00877 2.65 0.00820 0.185 0.00496
transformed_mars 8 4 2.24 0.00787 2.65 0.00776 0.186 0.00516
transformed_knn 14 6 2.33 0.00926 2.78 0.00959 0.133 0.00450
Show and hide code
combined_results |>  
  rank_results(select_best = T) |>
  mutate(wflow_id = fct_reorder(wflow_id, rank)) |> 
  ggplot(aes(wflow_id, mean, colour = wflow_id)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - 1.96 * std_err,
                    ymax = mean + 1.96 * std_err), width = 0.8) +
  facet_wrap(~.metric, scales = "free_y") +
  theme_bw() + 
  labs(x = "model", 
       y = "value metric", 
       colour = "model") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Show and hide code
ggsave(filename = "rys. 3._valid_result_model.jpeg", device = "jpeg", 
       width = 8.5, height = 3, dpi = 300)
Show and hide code
all_fit_metrics <-
  combined_results |>
  split(~wflow_id) |>
  map(
    \(x)
    extract_workflow_set_result(x = x, id = x$wflow_id) |>
      _$.metrics |>
      _[[1]] |> 
      mutate(.config = str_sub(.config, 20, 24))
  )    

12.1 Ranger

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "ranger") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric mtry trees min_n .config
1 4 157 25 Preprocessor1_Model07
2 4 157 25 Preprocessor1_Model07
3 4 157 25 Preprocessor1_Model07
Show and hide code
all_fit_metrics[["ranger"]] |>                              # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~mtry) +
  scale_x_continuous(limits = c(0, 600), breaks = seq(0, 600, 100)) +
  ggtitle(label = "ranger") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Najlepszy mtry = 4 300/400 drzew jest wystarczjaące Najniższe RMSE dla sporej liczby min_n - koło 25, drzewa płytsze i zgeneralizowane

Show and hide code
all_fit_metrics[["ranger"]] |> 
  filter(.metric == "rmse", .estimate < 2.74) |> 
  arrange(.estimate) |> 
  gt::gt()
mtry trees min_n .metric .estimator .estimate .config
4 157 25 rmse standard 2.735557 07
4 426 22 rmse standard 2.735890 05
4 279 24 rmse standard 2.737055 02

Najlepsze parametry z config 07

12.2 Cubist

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "bist_cubist") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric committees neighbors .config
1 11 0 Preprocessor1_Model06
2 11 0 Preprocessor1_Model06
3 11 0 Preprocessor1_Model06
Show and hide code
all_fit_metrics[["bist_cubist"]] |>                               # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point(position = position_jitter(width = 0.5, height = 0)) +
  facet_wrap(~neighbors, scales = "free_y") +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Najlepsze neighbors 0 i dla najwiekszych im zwiekszam tym sie rmse zmniejsza.

Show and hide code
all_fit_metrics[["bist_cubist"]] |>                               # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  filter(between(neighbors, 7, 9)) |> 
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point() +
  geom_line() +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist")

Show and hide code
all_fit_metrics[["bist_cubist"]] |> 
  filter(.metric == "rmse", .estimate < 2.86) |> 
  arrange(.estimate) |> 
  gt::gt()
committees neighbors .metric .estimator .estimate .config
11 0 rmse standard 2.716503 06
37 0 rmse standard 2.717536 19
53 0 rmse standard 2.717673 27
77 0 rmse standard 2.717773 39
91 0 rmse standard 2.717807 46
9 9 rmse standard 2.858830 05
23 9 rmse standard 2.859291 12
47 9 rmse standard 2.859443 24
63 9 rmse standard 2.859481 32
89 9 rmse standard 2.859512 45

Dla małych neighbors dodatkowe komisje nie mają sensu, model jest już optymalny. Dla większych neighbors dodanie komisji nieznacznie zmienia RMSE, efekt jest minimalny.

12.3 Rpart

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "rpart") |> 
      select_best(metric = x) |>
      mutate(.metric = x)) |> 
  gt::gt()
cost_complexity min_n .config .metric
0.003393222 17 Preprocessor1_Model42 mae
0.003393222 17 Preprocessor1_Model42 rmse
0.003393222 17 Preprocessor1_Model42 rsq
Show and hide code
all_fit_metrics[["rpart"]] |>                                # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(cost_complexity, .estimate, color = factor(min_n))) +
  geom_point() +
  geom_line() +
#  facet_wrap(~min_n) +
  scale_x_log10(breaks = breaks_log(n = 10, base = 10), labels = label_log(base = 10)) +
  labs(title =  "rpart", color = "min_n", y = "rmse")

Show and hide code
all_fit_metrics[["rpart"]] |>
  filter(.metric == "rmse") |>
  slice_min(.estimate, n = 3)
# A tibble: 4 × 6
  cost_complexity min_n .metric .estimator .estimate .config
            <dbl> <int> <chr>   <chr>          <dbl> <chr>  
1         0.00518    21 rmse    standard        2.72 43     
2         0.00339    17 rmse    standard        2.73 42     
3         0.00791    25 rmse    standard        2.74 44     
4         0.0121     28 rmse    standard        2.74 45     

12.4 xgboost

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "xgboost") |> 
      select_best(metric = x) |>
      mutate(.metric = x)) |> 
  gt::gt()
trees min_n tree_depth learn_rate loss_reduction sample_size .config .metric
344 23 4 1.037333 138.9495 0.5510204 Preprocessor1_Model37 mae
344 23 4 1.037333 138.9495 0.5510204 Preprocessor1_Model37 rmse
344 23 4 1.037333 138.9495 0.5510204 Preprocessor1_Model37 rsq
Show and hide code
all_fit_metrics[["xgboost"]] |>                                # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~tree_depth, scales = "free_y") +
  labs(title =  "xgboost", color = "min_n", y = "rmse")

rmse <2.8

Show and hide code
all_fit_metrics[["xgboost"]] |>                                # Zmień model ...
  filter(.metric == "rmse", .estimate < 2.8) |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~tree_depth) +
  labs(title =  "xgboost", color = "min_n", y = "rmse")

min_n 9 oraz depth 5

12.5 Transformed mars

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
      combined_results |> 
      extract_workflow_set_result(id = "transformed_mars") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric num_terms prod_degree .config
1 5 2 Preprocessor1_Model8
2 5 2 Preprocessor1_Model8
3 5 2 Preprocessor1_Model8
Show and hide code
all_fit_metrics[["transformed_mars"]] |>                              
  filter(.metric == "rmse") |>                              
  ggplot(aes(num_terms, .estimate, color = factor(prod_degree))) + 
  geom_point() +
  geom_line() +
  facet_wrap(~prod_degree, scales = "free_y") +
  ggtitle(label = "transformed_mars") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

minimalne różnice

12.6 knn

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
      combined_results |> 
      extract_workflow_set_result(id = "transformed_knn") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric neighbors .config
1 15 Preprocessor1_Model14
2 15 Preprocessor1_Model14
3 15 Preprocessor1_Model14
Show and hide code
colnames(all_fit_metrics[["transformed_knn"]])
[1] "neighbors"  ".metric"    ".estimator" ".estimate"  ".config"   
Show and hide code
all_fit_metrics[["transformed_knn"]] |>                              
  filter(.metric == "rmse") |>                              
  ggplot(aes(neighbors, .estimate, color = factor(neighbors))) +  
  geom_point() +
  geom_line(aes(group = 1)) + 
  ggtitle(label = "transformed_knn") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

RMSE maleje wraz ze wzrostem sasiadow

Show and hide code
all_fit_metrics[["transformed_knn"]] |> 
  filter(.metric == "rmse", .estimate < 2.87) |> 
  arrange(.estimate) |> 
  gt::gt()
neighbors .metric .estimator .estimate .config
15 rmse standard 2.821374 14
14 rmse standard 2.833143 13
12 rmse standard 2.860002 12

Każda z opcji jest dobra - małe różnice

13 Result best models

Show and hide code
p <-
  best_models |>
  map_dfr(.f = \(x) collect_metrics(x = x), .id = "mod") |>
  mutate(mod = factor(mod, levels = c("rpart", "ranger", "bist_cubist", "xgboost", "transformed_mars", "transformed_knn"))) |>
  ggplot(aes(mod, .estimate, colour = mod)) +
  geom_point() +
  facet_wrap(~.metric, scales = "free_y") +
  theme_bw() +
  labs(
    x = "Model",
    y = "Wartosc metryki",
    colour = "Model"
  ) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p

Show and hide code
ggsave(
  filename = "rys_4_test_result_model.jpeg", device = "jpeg",
  width = 8.5, height = 3, dpi = 300
)

Testowy vs walidacyjny

Show and hide code
por_test_valid <-
  bind_rows(
  best_models |>
      map_dfr(\(x) collect_metrics(x = x), .id = "mod") |>
      select(mod, .metric, .estimate) |>
      rename(mean = .estimate) |>
      mutate(typ = "testowy"),
    combined_results |>
      rank_results(select_best = T) |>
      select(wflow_id, .metric, mean) |>
      rename(mod = wflow_id) |>
      mutate(typ = "walidacyjny")
  ) |>
  pivot_wider(
    names_from = typ,
    values_from = mean
  )

por_test_valid |>
  gt::gt()  |> 
  gt::fmt_number(n_sigfig = 3)
mod .metric testowy walidacyjny
bist_cubist rmse 2.62 2.60
bist_cubist rsq 0.226 0.215
bist_cubist mae 2.11 2.10
ranger rmse 2.66 2.65
ranger rsq 0.203 0.185
ranger mae 2.22 2.21
rpart rmse 2.64 2.63
rpart rsq 0.212 0.198
rpart mae 2.13 2.15
transformed_knn rmse 2.78 2.78
transformed_knn rsq 0.148 0.133
transformed_knn mae 2.34 2.33
transformed_mars rmse 2.69 2.65
transformed_mars rsq 0.182 0.186
transformed_mars mae 2.28 2.24
xgboost rmse 2.64 2.64
xgboost rsq 0.216 0.193
xgboost mae 2.20 2.19
Show and hide code
por_test_valid |>
  filter(.metric == "rmse") |> 
  ggplot(aes(walidacyjny, testowy, colour = mod)) +
  geom_point(size = 4) +
  theme_bw() +
  geom_abline(slope = 1) +
  labs(
    x = "Walidacyjny",
    y = "Testowy",
    colour = "Model"
  ) +
  coord_fixed(ratio = 1) +
  xlim(2.5, 3) + ylim(2.5, 3)

Show and hide code
por_test_valid |>
  filter(.metric == "rsq") |> 
  ggplot(aes(walidacyjny, testowy, colour = mod)) +
  geom_point(size = 4) +
  theme_bw() +
  geom_abline(slope = 1) +
  labs(
    x = "Walidacyjny",
    y = "Testowy",
    colour = "Model"
  ) +
  coord_fixed(ratio = 1) +
  expand_limits(x = c(0,0.25), y = c(0,0.25)) +
  scale_x_continuous(expand = c(0,0), breaks = seq(0, 0.25, 0.05)) +
  scale_y_continuous(expand = c(0,0), breaks = seq(0, 0.25, 0.05)) 

Show and hide code
por_test_valid |>
  filter(.metric == "mae") |> 
  ggplot(aes(walidacyjny, testowy, colour = mod)) +
  geom_point(size = 4) +
  theme_bw() +
  geom_abline(slope = 1) +
  labs(
    x = "Walidacyjny",
    y = "Testowy",
    colour = "Model"
  ) +
  coord_fixed(ratio = 1) +
  expand_limits(x = c(0,3), y = c(0,3)) +
  scale_x_continuous(expand = c(0,0), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(expand = c(0,0), breaks = seq(0, 3, 0.5)) 

Wykres rozrzutu

Show and hide code
my_breaks <- c(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)

range_axis_x_y <-
  best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  summarise(max = max(max(total_price), max(.pred))) |>
  pull()

best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  ggplot(aes(total_price, .pred)) +
  geom_hex() +
  facet_wrap(~mod) +
  geom_abline(slope = rep(c(1.5, 0.5, 1), 4)) +
  theme_bw() +
  viridis::scale_fill_viridis(
    breaks = my_breaks,
    labels = my_breaks,
    option = "D",
    direction = 1,
    trans = scales::pseudo_log_trans(sigma = 1)
  ) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, range_axis_x_y)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, range_axis_x_y)) +
  coord_obs_pred() +
  labs(
    x = expression("Cena całkowita"),
    y = expression("Cena całkowita"),
    fill = "Czestosc", title = "Ocena na zestawie testowym"
  )

Show and hide code
ggsave(
  filename = "fig_2_Best_models_for_test_data.jpeg",
  device = "jpeg",
  width = 8.00,
  height = 4.60,
  dpi = 300
)
Show and hide code
best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  openair::modStats(mod = ".pred", obs = "total_price", type = "mod") |>
  arrange(FAC2) |>
  select(-P) |>
  gt::gt() |>
  gt::fmt_number(columns = FAC2:IOA, n_sigfig = 3)
mod n FAC2 MB MGE NMB NMGE RMSE r COE IOA
transformed_mars 1001 0.820 0.0229 2.28 0.00371 0.369 2.69 0.426 0.0800 0.540
transformed_knn 1001 0.827 0.00347 2.34 0.000563 0.380 2.78 0.385 0.0545 0.527
xgboost 1001 0.831 0.0176 2.20 0.00286 0.357 2.64 0.465 0.112 0.556
ranger 1001 0.832 0.00970 2.22 0.00157 0.361 2.66 0.451 0.102 0.551
rpart 1001 0.889 0.00271 2.13 0.000439 0.345 2.64 0.461 0.140 0.570
bist_cubist 1001 0.971 −0.0249 2.11 −0.00404 0.342 2.62 0.475 0.148 0.574

Najniższe RMSE: cubist Najwyższe r: cubist Najwyższe IOA: cubist ( Index of Agreement based on Willmott et al. (2011), which spans between -1 and +1 with values approaching +1 representing better model performance) Najwyższe FAC2: dla cubist wyjątkowo dobre oszacowanie, bliskie 1.

Cubist wypadł najlepiej, modele drzew również dobrze sobie radzą, modele transformacyjne działają słabiej. Top3: cubist, rpart, xgboost.